# Import pandas library for data manipulation and analysis
import pandas as pd
# Import numpy library for numerical operations
import numpy as np
# Import matplotlib.pyplot for creating visualizations and plots
import matplotlib.pyplot as plt
# Import seaborn library for statistical data visualization
import seaborn as sns
# Import numpy library for numerical operations (note: this is a duplicate import)
import numpy as np
# Import statistical functions from scipy
from scipy import stats
# Import statsmodels for statistical models and tests
import statsmodels.api as sm
# Import ticker module from matplotlib for customizing axis tick labels and formatting
import matplotlib.ticker as mticker

# Load the world export-import dataset from a CSV file into a pandas DataFrame
dts_export_import = pd.read_csv('34_years_world_export_import_dataset.csv')
dts_export_import

# Get the dimensions of the dataset (number of rows and columns)
dts_export_import.shape

(8096, 33)

# Print information about the dataframe including data types and non-null values
print(dts_export_import.info())
# Generate descriptive statistics of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)
print(dts_export_import.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8096 entries, 0 to 8095
Data columns (total 33 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Partner Name                             8096 non-null   object 
 1   Year                                     8096 non-null   int64  
 2   Export (US$ Thousand)                    8096 non-null   float64
 3   Import (US$ Thousand)                    8096 non-null   float64
 4   Export Product Share (%)                 8076 non-null   float64
 5   Import Product Share (%)                 8096 non-null   int64  
 6   Revealed comparative advantage           4712 non-null   float64
 7   World Growth (%)                         4410 non-null   float64
 8   Country Growth (%)                       4410 non-null   float64
 9   AHS Simple Average (%)                   8080 non-null   float64
 10  AHS Weighted Average (%)                 8080 non-null   float64
 11  AHS Total Tariff Lines                   8080 non-null   float64
 12  AHS Dutiable Tariff Lines Share (%)      8080 non-null   float64
 13  AHS Duty Free Tariff Lines Share (%)     8080 non-null   float64
 14  AHS Specific Tariff Lines Share (%)      8080 non-null   float64
 15  AHS AVE Tariff Lines Share (%)           8080 non-null   float64
 16  AHS MaxRate (%)                          8080 non-null   float64
 17  AHS MinRate (%)                          8080 non-null   float64
 18  AHS SpecificDuty Imports (US$ Thousand)  8081 non-null   float64
 19  AHS Dutiable Imports (US$ Thousand)      8081 non-null   float64
 20  AHS Duty Free Imports (US$ Thousand)     8081 non-null   float64
 21  MFN Simple Average (%)                   8081 non-null   float64
 22  MFN Weighted Average (%)                 8081 non-null   float64
 23  MFN Total Tariff Lines                   8081 non-null   float64
 24  MFN Dutiable Tariff Lines Share (%)      8081 non-null   float64
 25  MFN Duty Free Tariff Lines Share (%)     8081 non-null   float64
 26  MFN Specific Tariff Lines Share (%)      8080 non-null   float64
 27  MFN AVE Tariff Lines Share (%)           8080 non-null   float64
 28  MFN MaxRate (%)                          8081 non-null   float64
 29  MFN MinRate (%)                          8081 non-null   float64
 30  MFN SpecificDuty Imports (US$ Thousand)  8081 non-null   float64
 31  MFN Dutiable Imports (US$ Thousand)      8081 non-null   float64
 32  MFN Duty Free Imports (US$ Thousand)     8081 non-null   float64
dtypes: float64(30), int64(2), object(1)
memory usage: 2.0+ MB
None
              Year  Export (US$ Thousand)  Import (US$ Thousand)  \
count  8096.000000           8.096000e+03           8.096000e+03   
mean   2004.908226           1.421192e+08           1.305216e+08   
std       9.707831           9.928417e+08           9.073802e+08   
min    1988.000000           0.000000e+00           3.000000e-02   
25%    1997.000000           4.274264e+05           1.601335e+05   
50%    2005.000000           3.719683e+06           2.053967e+06   
75%    2013.000000           2.585514e+07           2.102937e+07   
max    2021.000000           2.422743e+10           2.193121e+10   

       Export Product Share (%)  Import Product Share (%)  \
count                    8076.0                    8096.0   
mean                      100.0                     100.0   
std                         0.0                       0.0   
min                       100.0                     100.0   
25%                       100.0                     100.0   
50%                       100.0                     100.0   
75%                       100.0                     100.0   
max                       100.0                     100.0   

       Revealed comparative advantage  World Growth (%)  Country Growth (%)  \
count                          4712.0       4410.000000         4410.000000   
mean                              1.0          3.986016            3.986016   
std                               0.0         10.004221           10.004221   
min                               1.0        -62.280000          -62.280000   
25%                               1.0         -1.437500           -1.437500   
50%                               1.0          3.830000            3.830000   
75%                               1.0          9.407500            9.407500   
max                               1.0        174.000000          174.000000   

       AHS Simple Average (%)  AHS Weighted Average (%)  ...  \
count             8080.000000               8080.000000  ...   
mean                 6.656053                  6.082833  ...   
std                  3.837668                  6.765269  ...   
min                  0.000000                  0.000000  ...   
25%                  4.030000                  2.100000  ...   
50%                  6.120000                  4.700000  ...   
75%                  8.480000                  8.050000  ...   
max                 46.980000                197.760000  ...   

       MFN Total Tariff Lines  MFN Dutiable Tariff Lines Share (%)  \
count            8.081000e+03                          8081.000000   
mean             4.291884e+05                            72.722564   
std              4.337735e+05                             5.245883   
min              1.700000e+01                            37.110000   
25%              6.481600e+04                            69.400000   
50%              2.669150e+05                            71.340000   
75%              7.071260e+05                            75.260000   
max              1.538248e+06                            98.210000   

       MFN Duty Free Tariff Lines Share (%)  \
count                           8081.000000   
mean                              20.893574   
std                                4.554328   
min                                1.790000   
25%                               18.320000   
50%                               22.030000   
75%                               24.100000   
max                               62.890000   

       MFN Specific Tariff Lines Share (%)  MFN AVE Tariff Lines Share (%)  \
count                          8080.000000                     8080.000000   
mean                             52.018722                      133.145250   
std                              84.848859                      161.189034   
min                               0.000000                        0.000000   
25%                              10.617500                       33.375000   
50%                              28.180000                       88.050000   
75%                              55.807500                      176.242500   
max                            1800.000000                     3860.000000   

       MFN MaxRate (%)  MFN MinRate (%)  \
count      8081.000000           8081.0   
mean       2710.981771              0.0   
std         840.575331              0.0   
min          25.000000              0.0   
25%        3000.000000              0.0   
50%        3000.000000              0.0   
75%        3000.000000              0.0   
max        5000.000000              0.0   

       MFN SpecificDuty Imports (US$ Thousand)  \
count                             8.081000e+03   
mean                              5.454060e+07   
std                               4.137543e+08   
min                               0.000000e+00   
25%                               4.682219e+04   
50%                               9.012839e+05   
75%                               8.704754e+06   
max                               1.649501e+10   

       MFN Dutiable Imports (US$ Thousand)  \
count                         8.081000e+03   
mean                          1.097283e+08   
std                           7.691260e+08   
min                           9.400000e-01   
25%                           1.785371e+05   
50%                           2.048332e+06   
75%                           1.717688e+07   
max                           1.854956e+10   

       MFN Duty Free Imports (US$ Thousand)  
count                          8.081000e+03  
mean                           1.058794e+05  
std                            4.178933e+06  
min                            0.000000e+00  
25%                            0.000000e+00  
50%                            0.000000e+00  
75%                            0.000000e+00  
max                            3.495531e+08  

[8 rows x 32 columns]

# Calculate the sum of missing values (NaN) for each column in the dts_export_import dataframe
missing_values = dts_export_import.isnull().sum()
# Display the count of missing values for each column
missing_values

Partner Name                                  0
Year                                          0
Export (US$ Thousand)                         0
Import (US$ Thousand)                         0
Export Product Share (%)                     20
Import Product Share (%)                      0
Revealed comparative advantage             3384
World Growth (%)                           3686
Country Growth (%)                         3686
AHS Simple Average (%)                       16
AHS Weighted Average (%)                     16
AHS Total Tariff Lines                       16
AHS Dutiable Tariff Lines Share (%)          16
AHS Duty Free Tariff Lines Share (%)         16
AHS Specific Tariff Lines Share (%)          16
AHS AVE Tariff Lines Share (%)               16
AHS MaxRate (%)                              16
AHS MinRate (%)                              16
AHS SpecificDuty Imports (US$ Thousand)      15
AHS Dutiable Imports (US$ Thousand)          15
AHS Duty Free Imports (US$ Thousand)         15
MFN Simple Average (%)                       15
MFN Weighted Average (%)                     15
MFN Total Tariff Lines                       15
MFN Dutiable Tariff Lines Share (%)          15
MFN Duty Free Tariff Lines Share (%)         15
MFN Specific Tariff Lines Share (%)          16
MFN AVE Tariff Lines Share (%)               16
MFN MaxRate (%)                              15
MFN MinRate (%)                              15
MFN SpecificDuty Imports (US$ Thousand)      15
MFN Dutiable Imports (US$ Thousand)          15
MFN Duty Free Imports (US$ Thousand)         15
dtype: int64

# Calculate the total number of data points in the dataset (rows × columns)
total_dataset = np.product(dts_export_import.shape)
# Count the total number of missing values in the dataset
total_missing = missing_values.sum()
# Calculate the percentage of missing values in the dataset
percent = (total_missing / total_dataset) * 100
# Display the percentage of missing values
percent

4.172281111510361

### Crating Working Dataset

# Create a dataset with the key parameters to review and explore the information
Trade_Dataset = dts_export_import[['Partner Name',                       
'Year',                    
'Export (US$ Thousand)',               
'Import (US$ Thousand)',               
'AHS Simple Average (%)',              
'AHS Weighted Average (%)',            
'AHS Total Tariff Lines',              
'AHS Dutiable Imports (US$ Thousand)', 
'AHS Duty Free Imports (US$ Thousand)',
'MFN Simple Average (%)',              
'MFN Weighted Average (%)',            
'MFN Total Tariff Lines',              
'MFN Dutiable Imports (US$ Thousand)', 
'MFN Duty Free Imports (US$ Thousand)']].copy()

Trade_Dataset['Trade Balance'] = Trade_Dataset['Export (US$ Thousand)'] - Trade_Dataset['Import (US$ Thousand)']
Trade_Dataset['Trade Total'] = Trade_Dataset['Export (US$ Thousand)'] + Trade_Dataset['Import (US$ Thousand)']

Trade_Dataset

# Check the dimensions of the Trade_Dataset DataFrame
Trade_Dataset.shape

(8096, 16)

# Cleaning duplicates
Trade_Dataset = Trade_Dataset.drop_duplicates()
# Check the dimensions of the Trade_Dataset DataFrame
Trade_Dataset.shape

(8096, 16)

# Calculate the sum of missing values (NaN) for each column in the dts_export_import dataframe
missing_values = Trade_Dataset.isnull().sum()
# Display the count of missing values for each column
missing_values

Partner Name                             0
Year                                     0
Export (US$ Thousand)                    0
Import (US$ Thousand)                    0
AHS Simple Average (%)                  16
AHS Weighted Average (%)                16
AHS Total Tariff Lines                  16
AHS Dutiable Imports (US$ Thousand)     15
AHS Duty Free Imports (US$ Thousand)    15
MFN Simple Average (%)                  15
MFN Weighted Average (%)                15
MFN Total Tariff Lines                  15
MFN Dutiable Imports (US$ Thousand)     15
MFN Duty Free Imports (US$ Thousand)    15
Trade Balance                            0
Trade Total                              0
dtype: int64

# Remove rows with missing values (NaN) from the Trade_Dataset dataframe
Trade_Dataset = Trade_Dataset.dropna()

# Calculate the sum of missing values (NaN) for each column in the dts_export_import dataframe
missing_values = Trade_Dataset.isnull().sum()
# Display the count of missing values for each column
missing_values

Partner Name                            0
Year                                    0
Export (US$ Thousand)                   0
Import (US$ Thousand)                   0
AHS Simple Average (%)                  0
AHS Weighted Average (%)                0
AHS Total Tariff Lines                  0
AHS Dutiable Imports (US$ Thousand)     0
AHS Duty Free Imports (US$ Thousand)    0
MFN Simple Average (%)                  0
MFN Weighted Average (%)                0
MFN Total Tariff Lines                  0
MFN Dutiable Imports (US$ Thousand)     0
MFN Duty Free Imports (US$ Thousand)    0
Trade Balance                           0
Trade Total                             0
dtype: int64

# Check the dimensions of the Trade_Dataset DataFrame
Trade_Dataset.shape

(8080, 16)

# Check the data types of all columns in the Trade_Dataset DataFrame
# This helps understand what kind of data each column contains (e.g., integers, floats, strings)
Trade_Dataset.dtypes

Partner Name                             object
Year                                      int64
Export (US$ Thousand)                   float64
Import (US$ Thousand)                   float64
AHS Simple Average (%)                  float64
AHS Weighted Average (%)                float64
AHS Total Tariff Lines                  float64
AHS Dutiable Imports (US$ Thousand)     float64
AHS Duty Free Imports (US$ Thousand)    float64
MFN Simple Average (%)                  float64
MFN Weighted Average (%)                float64
MFN Total Tariff Lines                  float64
MFN Dutiable Imports (US$ Thousand)     float64
MFN Duty Free Imports (US$ Thousand)    float64
Trade Balance                           float64
Trade Total                             float64
dtype: object

# Plotting Export and Import Trends Over Time
filtered_data = Trade_Dataset[Trade_Dataset['Partner Name'] != ' World']
world_data = Trade_Dataset[Trade_Dataset['Partner Name'] == ' World']
yearly_world = world_data.groupby('Year')[['Export (US$ Thousand)', 'Import (US$ Thousand)', 'Trade Balance', 'Trade Total']].sum().reset_index()

plt.figure(figsize=(12, 6))
plt.plot(yearly_world['Year'], yearly_world['Export (US$ Thousand)'], label='Total World Imports')
plt.plot(yearly_world['Year'], yearly_world['Import (US$ Thousand)'], label='Total World Imports')
plt.plot(yearly_world['Year'], yearly_world['Trade Balance'], label='Total World Trade Balance')
plt.plot(yearly_world['Year'], yearly_world['Trade Total'], label='Total World Trade')
plt.xlabel('Year')
plt.ylabel('Trade Value (US$)')
plt.title('Global Export and Import Trends (1980-2013)')
plt.legend()
plt.grid(True)
plt.show()

# Top countries by export/import
top_exporters = filtered_data.groupby('Partner Name')['Export (US$ Thousand)'].sum().sort_values(ascending=False).head(10)
print("\nTop 10 Exporting Countries (Total over 34 years):")
print(top_exporters)

top_importers = filtered_data.groupby('Partner Name')['Import (US$ Thousand)'].sum().sort_values(ascending=False).head(10)
print("\nTop 10 Importing Countries (Total over 34 years):")
print(top_importers)

top_balance = filtered_data.groupby('Partner Name')['Trade Balance'].sum().sort_values(ascending=False).head(10)
print("\nTop 10 Trade Balance Countries (Total over 34 years):")
print(top_balance)

top_trade = filtered_data.groupby('Partner Name')['Trade Total'].sum().sort_values(ascending=False).head(10)
print("\nTop 10 Trade Total Countries (Total over 34 years):")
print(top_trade)

Top 10 Exporting Countries (Total over 34 years):
Partner Name
Europe & Central Asia         1.500154e+11
East Asia & Pacific           9.443141e+10
North America                 6.530151e+10
United States                 5.495734e+10
China                         2.826862e+10
Germany                       2.329030e+10
Latin America & Caribbean     2.155302e+10
Middle East & North Africa    2.062328e+10
United Kingdom                1.510597e+10
Japan                         1.453045e+10
Name: Export (US$ Thousand), dtype: float64

Top 10 Importing Countries (Total over 34 years):
Partner Name
Europe & Central Asia         1.388943e+11
East Asia & Pacific           1.062229e+11
North America                 4.311681e+10
China                         4.246660e+10
United States                 3.268715e+10
Germany                       2.848006e+10
Latin America & Caribbean     2.125695e+10
Middle East & North Africa    2.009109e+10
Japan                         1.943478e+10
France                        1.321301e+10
Name: Import (US$ Thousand), dtype: float64

Top 10 Trade Balance Countries (Total over 34 years):
Partner Name
United States            2.227019e+10
North America            2.218469e+10
Europe & Central Asia    1.112111e+10
Hong Kong, China         1.038343e+10
Unspecified              7.339505e+09
United Kingdom           4.615490e+09
Switzerland              2.670832e+09
South Asia               2.561655e+09
Turkey                   2.443434e+09
Netherlands              2.096898e+09
Name: Trade Balance, dtype: float64

Top 10 Trade Total Countries (Total over 34 years):
Partner Name
Europe & Central Asia         2.889098e+11
East Asia & Pacific           2.006543e+11
North America                 1.084183e+11
United States                 8.764450e+10
China                         7.073522e+10
Germany                       5.177036e+10
Latin America & Caribbean     4.280997e+10
Middle East & North Africa    4.071438e+10
Japan                         3.396523e+10
France                        2.744588e+10
Name: Trade Total, dtype: float64

# Create a figure with four subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax1, ax2, ax3, ax4 = axes.flatten()

# Function to format y-axis ticks in billions (e.g., 2.5B)
def billions(x, pos):
    return f'{x * 1e-9:.1f}B'

# Plot top exporters
top_exporters.plot(kind='bar', ax=ax1, color='green')
ax1.set_title('Top 10 Exporting Countries (Total over 34 years)')
ax1.set_ylabel('Export (Billions of US$)')
ax1.set_xlabel('Country')
ax1.yaxis.set_major_formatter(mticker.FuncFormatter(billions))
ax1.tick_params(axis='x', rotation=75)

# Plot top importers
top_importers.plot(kind='bar', ax=ax2, color='blue')
ax2.set_title('Top 10 Importing Countries (Total over 34 years)')
ax2.set_ylabel('Import (Billions of US$)')
ax2.set_xlabel('Country')
ax2.yaxis.set_major_formatter(mticker.FuncFormatter(billions))
ax2.tick_params(axis='x', rotation=75)

# Plot top balance
top_balance.plot(kind='bar', ax=ax3, color='orange')
ax3.set_title('Top 10 Balance Countries (Total over 34 years)')
ax3.set_ylabel('Total Balance (Billions of US$)')
ax3.set_xlabel('Country')
ax3.yaxis.set_major_formatter(mticker.FuncFormatter(billions))
ax3.tick_params(axis='x', rotation=75)

# Plot top trade
top_trade.plot(kind='bar', ax=ax4, color='purple')
ax4.set_title('Top 10 Trade Countries (Total over 34 years)')
ax4.set_ylabel('Total Trade (Billions of US$)')
ax4.set_xlabel('Country')
ax4.yaxis.set_major_formatter(mticker.FuncFormatter(billions))
ax4.tick_params(axis='x', rotation=75)

plt.tight_layout()
plt.show()

### Findings in the information

# From the last calculations and graphs, the values for imports and exports seem inverted.
# We review the source and we found a error in the generation of the data, for this reason we interchange the
# columns names
# Create a mapping for swapping column names
swap_dict = {
    'Export (US$ Thousand)': 'Import (US$ Thousand)',
    'Import (US$ Thousand)': 'Export (US$ Thousand)'
}

# Temporarily rename to avoid conflicts, then assign back the original names swapped
Trade_Dataset = Trade_Dataset.rename(columns={
    'Export (US$ Thousand)': 'TEMP_EXPORT',
    'Import (US$ Thousand)': 'Export (US$ Thousand)'
})
Trade_Dataset = Trade_Dataset.rename(columns={
    'TEMP_EXPORT': 'Import (US$ Thousand)'
})

Trade_Dataset

# Plotting Export and Import Trends Over Time
filtered_data = Trade_Dataset[Trade_Dataset['Partner Name'] != ' World']
world_data = Trade_Dataset[Trade_Dataset['Partner Name'] == ' World']
yearly_world = world_data.groupby('Year')[['Export (US$ Thousand)', 'Import (US$ Thousand)', 'Trade Balance', 'Trade Total']].sum().reset_index()

plt.figure(figsize=(12, 6))
plt.plot(yearly_world['Year'], yearly_world['Export (US$ Thousand)'], label='Total World Imports')
plt.plot(yearly_world['Year'], yearly_world['Import (US$ Thousand)'], label='Total World Imports')
plt.plot(yearly_world['Year'], yearly_world['Trade Balance'], label='Total World Trade Balance')
plt.plot(yearly_world['Year'], yearly_world['Trade Total'], label='Total World Trade')
plt.xlabel('Year')
plt.ylabel('Trade Value (US$)')
plt.title('Global Export and Import Trends (1980-2013)')
plt.legend()
plt.grid(True)
plt.show()

# Top countries by export/import
top_exporters = filtered_data.groupby('Partner Name')['Export (US$ Thousand)'].sum().sort_values(ascending=False).head(10)
print("\nTop 10 Exporting Countries (Total over 34 years):")
print(top_exporters)

top_importers = filtered_data.groupby('Partner Name')['Import (US$ Thousand)'].sum().sort_values(ascending=False).head(10)
print("\nTop 10 Importing Countries (Total over 34 years):")
print(top_importers)

top_balance = filtered_data.groupby('Partner Name')['Trade Balance'].sum().sort_values(ascending=False).head(10)
print("\nTop 10 Trade Balance Countries (Total over 34 years):")
print(top_balance)

top_trade = filtered_data.groupby('Partner Name')['Trade Total'].sum().sort_values(ascending=False).head(10)
print("\nTop 10 Trade Total Countries (Total over 34 years):")
print(top_trade)

Top 10 Exporting Countries (Total over 34 years):
Partner Name
Europe & Central Asia         1.388943e+11
East Asia & Pacific           1.062229e+11
North America                 4.311681e+10
China                         4.246660e+10
United States                 3.268715e+10
Germany                       2.848006e+10
Latin America & Caribbean     2.125695e+10
Middle East & North Africa    2.009109e+10
Japan                         1.943478e+10
France                        1.321301e+10
Name: Export (US$ Thousand), dtype: float64

Top 10 Importing Countries (Total over 34 years):
Partner Name
Europe & Central Asia         1.500154e+11
East Asia & Pacific           9.443141e+10
North America                 6.530151e+10
United States                 5.495734e+10
China                         2.826862e+10
Germany                       2.329030e+10
Latin America & Caribbean     2.155302e+10
Middle East & North Africa    2.062328e+10
United Kingdom                1.510597e+10
Japan                         1.453045e+10
Name: Import (US$ Thousand), dtype: float64

Top 10 Trade Balance Countries (Total over 34 years):
Partner Name
United States            2.227019e+10
North America            2.218469e+10
Europe & Central Asia    1.112111e+10
Hong Kong, China         1.038343e+10
Unspecified              7.339505e+09
United Kingdom           4.615490e+09
Switzerland              2.670832e+09
South Asia               2.561655e+09
Turkey                   2.443434e+09
Netherlands              2.096898e+09
Name: Trade Balance, dtype: float64

Top 10 Trade Total Countries (Total over 34 years):
Partner Name
Europe & Central Asia         2.889098e+11
East Asia & Pacific           2.006543e+11
North America                 1.084183e+11
United States                 8.764450e+10
China                         7.073522e+10
Germany                       5.177036e+10
Latin America & Caribbean     4.280997e+10
Middle East & North Africa    4.071438e+10
Japan                         3.396523e+10
France                        2.744588e+10
Name: Trade Total, dtype: float64

# Create a figure with four subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax1, ax2, ax3, ax4 = axes.flatten()

# Function to format y-axis ticks in billions (e.g., 2.5B)
def billions(x, pos):
    return f'{x * 1e-9:.1f}B'

# Plot top exporters
top_exporters.plot(kind='bar', ax=ax1, color='green')
ax1.set_title('Top 10 Exporting Countries (Total over 34 years)')
ax1.set_ylabel('Export (Billions of US$)')
ax1.set_xlabel('Country')
ax1.yaxis.set_major_formatter(mticker.FuncFormatter(billions))
ax1.tick_params(axis='x', rotation=75)

# Plot top importers
top_importers.plot(kind='bar', ax=ax2, color='blue')
ax2.set_title('Top 10 Importing Countries (Total over 34 years)')
ax2.set_ylabel('Import (Billions of US$)')
ax2.set_xlabel('Country')
ax2.yaxis.set_major_formatter(mticker.FuncFormatter(billions))
ax2.tick_params(axis='x', rotation=75)

# Plot top balance
top_balance.plot(kind='bar', ax=ax3, color='orange')
ax3.set_title('Top 10 Balance Countries (Total over 34 years)')
ax3.set_ylabel('Total Balance (Billions of US$)')
ax3.set_xlabel('Country')
ax3.yaxis.set_major_formatter(mticker.FuncFormatter(billions))
ax3.tick_params(axis='x', rotation=75)

# Plot top trade
top_trade.plot(kind='bar', ax=ax4, color='purple')
ax4.set_title('Top 10 Trade Countries (Total over 34 years)')
ax4.set_ylabel('Total Trade (Billions of US$)')
ax4.set_xlabel('Country')
ax4.yaxis.set_major_formatter(mticker.FuncFormatter(billions))
ax4.tick_params(axis='x', rotation=75)

plt.tight_layout()
plt.show()

# Identify top 5 trading partners by total trade volume
partner_totals = filtered_data.groupby('Partner Name')[['Export (US$ Thousand)', 'Import (US$ Thousand)', 'Trade Balance', 'Trade Total']].sum()

top_10_partners = partner_totals.nlargest(10, 'Trade Total').index.tolist()

# Create a figure with subplots
fig, axes = plt.subplots(5, 2, figsize=(15, 25))
axes = axes.flatten()

# Plot individual trends for top 10 partners
for i, partner in enumerate(top_10_partners, 0):
    partner_data = filtered_data[filtered_data['Partner Name'] == partner]
    yearly_partner = partner_data.groupby('Year')[['Export (US$ Thousand)', 'Import (US$ Thousand)', 'Trade Balance', 'Trade Total']].sum().reset_index()
    
    # Plot original data
    axes[i].plot(yearly_partner['Year'], yearly_partner['Export (US$ Thousand)'], label='Exports')
    axes[i].plot(yearly_partner['Year'], yearly_partner['Import (US$ Thousand)'], label='Imports')
    axes[i].plot(yearly_partner['Year'], yearly_partner['Trade Balance'], label='Balance')
    axes[i].plot(yearly_partner['Year'], yearly_partner['Trade Total'], label='Total')
    
    # Add trend lines
    # For exports
    z_exp = np.polyfit(yearly_partner['Year'], yearly_partner['Export (US$ Thousand)'], 1)
    p_exp = np.poly1d(z_exp)
    axes[i].plot(yearly_partner['Year'], p_exp(yearly_partner['Year']), "r--", label='Export Trend')
    
    # For imports
    z_imp = np.polyfit(yearly_partner['Year'], yearly_partner['Import (US$ Thousand)'], 1)
    p_imp = np.poly1d(z_imp)
    axes[i].plot(yearly_partner['Year'], p_imp(yearly_partner['Year']), "b--", label='Import Trend')

    # For Balance
    z_exp = np.polyfit(yearly_partner['Year'], yearly_partner['Trade Balance'], 1)
    p_exp = np.poly1d(z_exp)
    axes[i].plot(yearly_partner['Year'], p_exp(yearly_partner['Year']), "r--", label='Balance Trend')
    
    # For Totals
    z_imp = np.polyfit(yearly_partner['Year'], yearly_partner['Trade Total'], 1)
    p_imp = np.poly1d(z_imp)
    axes[i].plot(yearly_partner['Year'], p_imp(yearly_partner['Year']), "b--", label='Total Trend')
    
    
    # Rest of the code remains the same
    axes[i].set_title(f'Trade with {partner}')
    axes[i].set_xlabel('Year')
    axes[i].set_ylabel('Trade Value (US$)')
    axes[i].legend()
    axes[i].grid(True)

plt.tight_layout()
plt.show()

# Review of the correlation between the variables
# Create a figure with 5 rows and 2 columns (10 subplots total) for the top 10 partners

# Get a list of all numeric column names (excluding 'Year' if you wish)
numeric_columns = filtered_data.select_dtypes(include='number').columns.tolist()
# Remove 'Year' if you don't want to include it in the correlation
numeric_columns = [col for col in numeric_columns if col != 'Year']

fig, axes = plt.subplots(5, 2, figsize=(12, 30))  # Larger size for better readability
axes = axes.flatten()  # Convert 2D array of axes to 1D for easier iteration

# Loop through each of the top 10 trading partners
for i, partner in enumerate(top_10_partners):
    # Filter data for the current partner and select only numeric columns
    partner_data = filtered_data[filtered_data['Partner Name'] == partner][numeric_columns].dropna()
    
    # Only create a correlation matrix if we have more than 1 data point
    if len(partner_data) > 1:
        # Calculate the correlation matrix for this partner's data
        corr_matrix = partner_data.corr()
        
        # Create a heatmap visualization of the correlation matrix
        sns.heatmap(
            corr_matrix,
            annot=False,  # Don't show correlation values as text
            cmap='coolwarm',  # Use a blue-red color scheme
            fmt='.2f',  # Format for annotation (not used since annot=False)
            ax=axes[i],  # Place on the current subplot
            cbar=False,  # Don't show the color bar
            square=True,  # Make cells square-shaped
            linewidths=0.5  # Add thin lines between cells
        )
        
        # Set title and format axis labels
        axes[i].set_title(partner, fontsize=14)
        axes[i].set_xticklabels(axes[i].get_xticklabels(), rotation=60, ha='right', fontsize=8)  # Rotate x labels
        axes[i].set_yticklabels(axes[i].get_yticklabels(), rotation=0, fontsize=8)  # Keep y labels horizontal
        axes[i].tick_params(axis='both', which='major', labelsize=8)  # Set tick label size
    else:
        # If insufficient data, turn off the axis and display a message
        axes[i].axis('off')
        axes[i].set_title(f"No data\n{partner}")

# Turn off any unused subplots
for j in range(len(top_10_partners), len(axes)):
    axes[j].axis('off')

# Add an overall title to the figure
fig.suptitle('Correlation Matrices for Top 10 Trading Partners', fontsize=20, y=1.03)
plt.tight_layout(rect=[0, 0.03, 1, 0.98])  # Adjust layout to prevent overlap
plt.show()  # Display the figure

# Remove "World" aggregate if needed
filtered_data = Trade_Dataset[Trade_Dataset['Partner Name'] != ' World']

# Prepare subplots: 5 rows x 2 columns for 10 partners
fig, axes = plt.subplots(5, 2, figsize=(20, 24))
axes = axes.flatten()

for i, partner in enumerate(top_10_partners):
    partner_data = filtered_data[filtered_data['Partner Name'] == partner]
    yearly_summary = partner_data.groupby('Year')[['Export (US$ Thousand)', 'Import (US$ Thousand)', 'Trade Balance', 'Trade Total']].sum()
    
    ax = axes[i]
    yearly_summary['Export (US$ Thousand)'].plot(ax=ax, label='Exports')
    yearly_summary['Import (US$ Thousand)'].plot(ax=ax, label='Imports')
    yearly_summary['Trade Balance'].plot(ax=ax, label='Trade Balance')
    yearly_summary['Trade Total'].plot(ax=ax, label='Trade Total')
    
    # Highlight global events
    ax.axvspan(2008, 2009, color='red', alpha=0.3, label='Financial Crisis')
    ax.axvspan(2014, 2015, color='yellow', alpha=0.3, label='Housing Crisis')
    ax.axvspan(2020, 2021, color='orange', alpha=0.3, label='COVID-19 Pandemic')
    
    ax.set_title(partner, fontsize=14)
    ax.set_xlabel("Year")
    ax.set_ylabel("US$ Thousand")
    ax.grid(True)
    ax.legend(fontsize=10)

# Remove any empty axes if fewer than 10 partners
for j in range(len(top_10_partners), len(axes)):
    axes[j].axis('off')

fig.suptitle('Trade Trends and Major Events for Top 10 Trading Partners', fontsize=20, y=1.01)
plt.tight_layout(rect=[0, 0.03, 1, 0.98])
plt.show()

# Data manipulation libraries
import os
import numpy as np
import pandas as pd

# Statistical analysis libraries
from scipy import stats
from scipy.stats import pearsonr, ttest_ind
import statsmodels.api as sm

# Machine learning libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
import joblib

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Load the world export-import dataset from a CSV file into a pandas DataFrame
dts_export_import = pd.read_csv('34_years_world_export_import_dataset.csv')

# Define a list of major trading partners to focus analysis on
partners = [
    'China',
    'United States',
    'Germany',
    'Japan',
    'France',
    'United Kingdom',
    'Switzerland',
    'Turkey',
    'Netherlands'
]

# Filter the trade dataset to include only the specified partners
# Exclude 'World' aggregate data and keep only rows where Partner Name is in our list
filtered_data = dts_export_import[
    (dts_export_import['Partner Name'] != ' World') &
    (dts_export_import['Partner Name'].isin(partners))
]

# Create a new dataframe with selected columns from filtered_data
Trade_Dataset = filtered_data[[
'Partner Name',                       
'Year',                    
'Export (US$ Thousand)',               
'Import (US$ Thousand)',
'AHS Total Tariff Lines',
'MFN Total Tariff Lines',
'AHS Duty Free Tariff Lines Share (%)'
]].copy()

# Calculate trade balance (exports minus imports)
Trade_Dataset['Trade Balance'] = Trade_Dataset['Export (US$ Thousand)'] - Trade_Dataset['Import (US$ Thousand)']
# Calculate total trade volume (exports plus imports)
Trade_Dataset['Trade Total'] = Trade_Dataset['Export (US$ Thousand)'] + Trade_Dataset['Import (US$ Thousand)']

# Remove rows with missing values
Trade_Dataset = Trade_Dataset.dropna()

# Remove duplicate rows from the dataset
Trade_Dataset = Trade_Dataset.drop_duplicates()

### Swapping Columns for Export and Import detected in the exploration

# From the last calculations and graphs, the values for imports and exports seem inverted.
# We review the source and we found a error in the generation of the data, for this reason we interchange the
# columns names
# Create a mapping for swapping column names
swap_dict = {
    'Export (US$ Thousand)': 'Import (US$ Thousand)',
    'Import (US$ Thousand)': 'Export (US$ Thousand)'
}

# Temporarily rename to avoid conflicts, then assign back the original names swapped
Trade_Dataset = Trade_Dataset.rename(columns={
    'Export (US$ Thousand)': 'TEMP_EXPORT',
    'Import (US$ Thousand)': 'Export (US$ Thousand)'
})
Trade_Dataset = Trade_Dataset.rename(columns={
    'TEMP_EXPORT': 'Import (US$ Thousand)'
})

def remove_iqr_outliers(df, columns):
    """
    Remove outliers from dataframe using the Interquartile Range (IQR) method.
    
    Args:
        df (DataFrame): Input dataframe
        columns (list): List of column names to check for outliers
        
    Returns:
        DataFrame: Dataframe with outliers removed
    """
    for col in columns:
        # Calculate first quartile (25th percentile)
        Q1 = df[col].quantile(0.25)
        # Calculate third quartile (75th percentile)
        Q3 = df[col].quantile(0.75)
        # Calculate interquartile range
        IQR = Q3 - Q1
        # Define lower and upper bounds for outliers (1.5 * IQR rule)
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Filter dataframe to keep only values within bounds
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

def remove_zscore_outliers(df, columns, threshold=3):
    """
    Remove outliers from dataframe using the Z-score method.
    
    Args:
        df (DataFrame): Input dataframe
        columns (list): List of column names to check for outliers
        threshold (float): Z-score threshold (default: 3)
        
    Returns:
        DataFrame: Dataframe with outliers removed
    """
    # Calculate absolute Z-scores for specified columns
    z_scores = np.abs(stats.zscore(df[columns]))
    # Create mask where all Z-scores are below threshold
    mask = (z_scores < threshold).all(axis=1)
    # Return filtered dataframe
    return df[mask]

# Define numeric columns to check for outliers
numeric_cols = [
    'Export (US$ Thousand)', 
    'Import (US$ Thousand)',
    'AHS Total Tariff Lines',
    'MFN Total Tariff Lines',
    'AHS Duty Free Tariff Lines Share (%)',
    'Trade Balance', 
    'Trade Total'
]

# List to store cleaned dataframes for each partner
all_cleaned = []

# Process each partner separately
for partner in partners:
    # Filter data for current partner
    partner_data = Trade_Dataset[Trade_Dataset['Partner Name'] == partner]
    print("Shape before outlier removal:"+ partner +" "+ str(partner_data.shape))

    # Apply two-stage outlier removal process
    # First remove IQR outliers
    cleaned_iqr = remove_iqr_outliers(partner_data, numeric_cols)
    # Then remove Z-score outliers from the IQR-cleaned data
    cleaned_zscore = remove_zscore_outliers(cleaned_iqr, numeric_cols)
    # Add cleaned partner data to our collection
    all_cleaned.append(cleaned_zscore)
    
    print("Shape after outlier removal:"+ partner +" "+str(cleaned_zscore.shape))

# Combine all cleaned partner dataframes into a single dataframe
Trade_Dataset_no_outliers = pd.concat(all_cleaned, ignore_index=True)

print("Shape after outlier removal (all partners combined):", Trade_Dataset_no_outliers.shape)

Shape before outlier removal:China (34, 9)
Shape after outlier removal:China (28, 9)
Shape before outlier removal:United States (34, 9)
Shape after outlier removal:United States (34, 9)
Shape before outlier removal:Germany (34, 9)
Shape after outlier removal:Germany (34, 9)
Shape before outlier removal:Japan (34, 9)
Shape after outlier removal:Japan (29, 9)
Shape before outlier removal:France (34, 9)
Shape after outlier removal:France (34, 9)
Shape before outlier removal:United Kingdom (34, 9)
Shape after outlier removal:United Kingdom (31, 9)
Shape before outlier removal:Switzerland (34, 9)
Shape after outlier removal:Switzerland (34, 9)
Shape before outlier removal:Turkey (34, 9)
Shape after outlier removal:Turkey (34, 9)
Shape before outlier removal:Netherlands (34, 9)
Shape after outlier removal:Netherlands (34, 9)
Shape after outlier removal (all partners combined): (292, 9)

os.makedirs("H1_Regression_Models", exist_ok=True)  # Create a directory for saved models
for i, partner in enumerate(partners):
    # Filter dataset for the current partner
    partner_data = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Partner Name'] == partner]

    # Define columns for tariff analysis
    tariff_cols = ['AHS Total Tariff Lines', 'MFN Total Tariff Lines']
    trade_balance_col = 'Trade Balance'

    print("\n==========================================")
    print(f"Partner: {partner}")
    
    # Pearson Correlations
    print("Pearson Correlation:")
    for col in tariff_cols:
        corr, pval = pearsonr(partner_data[col], partner_data[trade_balance_col])
        print(f"  {col} vs Trade Balance: r = {corr:.3f}, p = {pval:.3f}")

    for col in tariff_cols:
        print(f"\n--- Regression Analysis: {col} vs Trade Balance ---")
        X = partner_data[[col]]
        y = partner_data[trade_balance_col]

        # Train/Val/Test Split
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

        # Fit Linear Regression
        model = LinearRegression().fit(X_train, y_train)

        # Evaluation on Validation set
        y_pred_val = model.predict(X_val)
        print("Validation:")
        print(f"  RMSE: {root_mean_squared_error(y_val, y_pred_val):.2f}")
        print(f"  MAE: {mean_absolute_error(y_val, y_pred_val):.2f}")
        print(f"  R^2: {r2_score(y_val, y_pred_val):.3f}")

        # Final Test set evaluation
        y_pred_test = model.predict(X_test)
        print("Test:")
        print(f"  RMSE: {root_mean_squared_error(y_test, y_pred_test):.2f}")
        print(f"  MAE: {mean_absolute_error(y_test, y_pred_test):.2f}")
        print(f"  R^2: {r2_score(y_test, y_pred_test):.3f}")

        # Save model
        model_filename = f"H1_Regression_Models/{partner.replace(' ', '_')}_{col.replace(' ', '_').lower()}_model.joblib"
        joblib.dump(model, model_filename)
        print(f"Model saved as {model_filename}")

        # Visualization
        plt.figure(figsize=(8,5))
        sns.regplot(x=col, y=trade_balance_col, data=partner_data, ci=None)
        plt.title(f"Linear Regression: {col} vs Trade Balance ({partner})")
        plt.xlabel(col)
        plt.ylabel("Trade Balance")
        plt.show()

==========================================
Partner: China
Pearson Correlation:
  AHS Total Tariff Lines vs Trade Balance: r = -0.933, p = 0.000
  MFN Total Tariff Lines vs Trade Balance: r = -0.860, p = 0.000

--- Regression Analysis: AHS Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 84645061.39
  MAE: 60266570.44
  R^2: 0.950
Test:
  RMSE: 146732519.70
  MAE: 128542684.28
  R^2: 0.792
Model saved as H1_Regression_Models/China_ahs_total_tariff_lines_model.joblib

--- Regression Analysis: MFN Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 106451786.33
  MAE: 77614268.99
  R^2: 0.921
Test:
  RMSE: 250576920.62
  MAE: 223969315.14
  R^2: 0.394
Model saved as H1_Regression_Models/China_mfn_total_tariff_lines_model.joblib

==========================================
Partner: United States
Pearson Correlation:
  AHS Total Tariff Lines vs Trade Balance: r = 0.919, p = 0.000
  MFN Total Tariff Lines vs Trade Balance: r = 0.925, p = 0.000

--- Regression Analysis: AHS Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 223957365.13
  MAE: 207919742.82
  R^2: 0.708
Test:
  RMSE: 125986483.37
  MAE: 107486890.95
  R^2: 0.905
Model saved as H1_Regression_Models/United_States_ahs_total_tariff_lines_model.joblib

--- Regression Analysis: MFN Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 199587841.23
  MAE: 182883606.35
  R^2: 0.768
Test:
  RMSE: 129671040.71
  MAE: 110860495.82
  R^2: 0.899
Model saved as H1_Regression_Models/United_States_mfn_total_tariff_lines_model.joblib

==========================================
Partner: Germany
Pearson Correlation:
  AHS Total Tariff Lines vs Trade Balance: r = -0.837, p = 0.000
  MFN Total Tariff Lines vs Trade Balance: r = -0.904, p = 0.000

--- Regression Analysis: AHS Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 73857598.36
  MAE: 65589302.58
  R^2: 0.367
Test:
  RMSE: 52890439.96
  MAE: 48340094.99
  R^2: 0.775
Model saved as H1_Regression_Models/Germany_ahs_total_tariff_lines_model.joblib

--- Regression Analysis: MFN Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 64038989.78
  MAE: 59361187.43
  R^2: 0.524
Test:
  RMSE: 35117752.99
  MAE: 32088501.13
  R^2: 0.901
Model saved as H1_Regression_Models/Germany_mfn_total_tariff_lines_model.joblib

==========================================
Partner: Japan
Pearson Correlation:
  AHS Total Tariff Lines vs Trade Balance: r = -0.164, p = 0.395
  MFN Total Tariff Lines vs Trade Balance: r = -0.129, p = 0.506

--- Regression Analysis: AHS Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 35001625.96
  MAE: 27896861.29
  R^2: -0.285
Test:
  RMSE: 34633329.90
  MAE: 26707419.36
  R^2: 0.130
Model saved as H1_Regression_Models/Japan_ahs_total_tariff_lines_model.joblib

--- Regression Analysis: MFN Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 34765627.28
  MAE: 27667334.22
  R^2: -0.268
Test:
  RMSE: 35420305.64
  MAE: 27507690.76
  R^2: 0.090
Model saved as H1_Regression_Models/Japan_mfn_total_tariff_lines_model.joblib

==========================================
Partner: France
Pearson Correlation:
  AHS Total Tariff Lines vs Trade Balance: r = 0.745, p = 0.000
  MFN Total Tariff Lines vs Trade Balance: r = 0.789, p = 0.000

--- Regression Analysis: AHS Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 15165415.22
  MAE: 13104652.95
  R^2: 0.680
Test:
  RMSE: 24459007.10
  MAE: 22372128.52
  R^2: 0.116
Model saved as H1_Regression_Models/France_ahs_total_tariff_lines_model.joblib

--- Regression Analysis: MFN Total Tariff Lines vs Trade Balance ---
Validation:
  RMSE: 14204233.60
  MAE: 11495604.18
  R^2: 0.719
Test:
  RMSE: 21763363.94
  MAE: 19245940.64
  R^2: 0.300
Model saved as H1_Regression_Models/France_mfn_total_tariff_lines_model.joblib

os.makedirs("H2_RandomForest_models", exist_ok=True)

for i, partner in enumerate(partners):
    # Filter dataset for the current partner
    partner_data = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Partner Name'] == partner]

    # Prepare features (X) and target variable (y)
    X = partner_data[['AHS Duty Free Tariff Lines Share (%)']].values
    y = partner_data['Export (US$ Thousand)'].values

    # Split data: first train/test+val, then split test+val into test and val
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    # Now: 60% train, 20% val, 20% test

    # Initialize and train Random Forest Regressor model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Make predictions for validation and test sets
    y_pred_val = rf.predict(X_val)
    y_pred_test = rf.predict(X_test)

    # Evaluate model performance (validation)
    val_rmse = root_mean_squared_error(y_val, y_pred_val)
    val_mae = mean_absolute_error(y_val, y_pred_val)
    val_r2 = r2_score(y_val, y_pred_val)

    # Evaluate model performance (test)
    test_rmse = root_mean_squared_error(y_test, y_pred_test)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_r2 = r2_score(y_test, y_pred_test)

    print(f"\nRandom Forest Analysis for: {partner}")
    print("Validation Set:")
    print(f"  RMSE: {val_rmse:.2f}")
    print(f"  MAE: {val_mae:.2f}")
    print(f"  R^2: {val_r2:.3f}")
    print("Test Set:")
    print(f"  RMSE: {test_rmse:.2f}")
    print(f"  MAE: {test_mae:.2f}")
    print(f"  R^2: {test_r2:.3f}")

    # Generate prediction line across the range of duty-free percentages
    duty_free_range = np.linspace(
        partner_data['AHS Duty Free Tariff Lines Share (%)'].min(),
        partner_data['AHS Duty Free Tariff Lines Share (%)'].max(),
        100
    ).reshape(-1, 1)
    predicted_exports = rf.predict(duty_free_range)

    # Visualization
    plt.figure(figsize=(8,5))
    plt.scatter(X, y, alpha=0.2, label='Actual Data')
    plt.plot(duty_free_range, predicted_exports, color='red', linewidth=2, label='RF Prediction')
    plt.xlabel('AHS Duty Free Tariff Lines Share (%)')
    plt.ylabel('Export (US$ Thousand)')
    plt.title(f'Random Forest: Duty-Free Share vs. Export Volume for {partner}')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Feature importance
    print("  Feature Importance:", rf.feature_importances_)

    # Save the model
    model_path = f"H2_RandomForest_models/{partner.replace(' ', '_').replace('/', '_')}_rf_model.joblib"
    joblib.dump(rf, model_path)
    print(f"  Model saved to {model_path}")

Random Forest Analysis for: China
Validation Set:
  RMSE: 1759556447.74
  MAE: 1508054601.90
  R^2: -1.470
Test Set:
  RMSE: 460812398.23
  MAE: 374897786.43
  R^2: -2.150

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/China_rf_model.joblib

Random Forest Analysis for: United States
Validation Set:
  RMSE: 315896653.62
  MAE: 283219389.53
  R^2: 0.439
Test Set:
  RMSE: 234934711.71
  MAE: 174679659.92
  R^2: 0.670

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/United_States_rf_model.joblib

Random Forest Analysis for: Germany
Validation Set:
  RMSE: 434334918.17
  MAE: 224479733.05
  R^2: 0.050
Test Set:
  RMSE: 157521001.29
  MAE: 138489918.04
  R^2: 0.849

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/Germany_rf_model.joblib

Random Forest Analysis for: Japan
Validation Set:
  RMSE: 94039635.34
  MAE: 79827423.60
  R^2: 0.846
Test Set:
  RMSE: 67315906.28
  MAE: 56729710.03
  R^2: 0.697

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/Japan_rf_model.joblib

Random Forest Analysis for: France
Validation Set:
  RMSE: 189758577.18
  MAE: 106995583.94
  R^2: -0.088
Test Set:
  RMSE: 72876502.95
  MAE: 66371251.46
  R^2: 0.730

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/France_rf_model.joblib

Random Forest Analysis for: United Kingdom
Validation Set:
  RMSE: 62899910.47
  MAE: 54005123.44
  R^2: 0.829
Test Set:
  RMSE: 168420048.35
  MAE: 121907498.44
  R^2: -0.890

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/United_Kingdom_rf_model.joblib

Random Forest Analysis for: Switzerland
Validation Set:
  RMSE: 71425083.88
  MAE: 58213545.54
  R^2: 0.412
Test Set:
  RMSE: 38389412.39
  MAE: 27626667.72
  R^2: 0.878

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/Switzerland_rf_model.joblib

Random Forest Analysis for: Turkey
Validation Set:
  RMSE: 44462998.95
  MAE: 37098555.76
  R^2: 0.156
Test Set:
  RMSE: 37926189.39
  MAE: 27765489.57
  R^2: 0.530

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/Turkey_rf_model.joblib

Random Forest Analysis for: Netherlands
Validation Set:
  RMSE: 171980037.21
  MAE: 97588430.07
  R^2: -0.322
Test Set:
  RMSE: 75876798.72
  MAE: 62119762.08
  R^2: 0.730

  Feature Importance: [1.]
  Model saved to H2_RandomForest_models/Netherlands_rf_model.joblib

# Map years to event names
event_dict = {
    2008: 'Financial Crisis',
    2009: 'Financial Crisis',
    2014: 'Housing Crisis',
    2015: 'Housing Crisis',
    2020: 'COVID-19 Pandemic',
    2021: 'COVID-19 Pandemic'
}

# Create 'Global Event' column
Trade_Dataset_no_outliers['Global Event'] = Trade_Dataset_no_outliers['Year'].map(event_dict).fillna('Non-Event')

# Create binary column for T-test: 1 if event, 0 if non-event
Trade_Dataset_no_outliers['Is Event'] = Trade_Dataset_no_outliers['Global Event'].apply(lambda x: 0 if x == 'Non-Event' else 1)

trade_col = 'Export (US$ Thousand)'

# Split data
event_trades = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Is Event'] == 1][trade_col].dropna()
non_event_trades = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Is Event'] == 0][trade_col].dropna()

# T-test (Welch's)
t_stat, p_val = ttest_ind(event_trades, non_event_trades, equal_var=False)
print(f"T-test statistic: {t_stat:.3f}")
print(f"P-value: {p_val:.4f}")

# Boxplot by event group
plt.figure(figsize=(9,5))
sns.boxplot(x='Global Event', y=trade_col, data=Trade_Dataset_no_outliers, order=['Financial Crisis', 'Housing Crisis', 'COVID-19 Pandemic', 'Non-Event'])
plt.ylabel(trade_col)
plt.xlabel('Period')
plt.title('Export Trade Volumes by Global Event')
plt.tight_layout()
plt.show()

trade_col = 'Import (US$ Thousand)'

# Split data
event_trades = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Is Event'] == 1][trade_col].dropna()
non_event_trades = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Is Event'] == 0][trade_col].dropna()

# T-test (Welch's)
t_stat, p_val = ttest_ind(event_trades, non_event_trades, equal_var=False)
print(f"T-test statistic: {t_stat:.3f}")
print(f"P-value: {p_val:.4f}")

# Boxplot by event group
plt.figure(figsize=(9,5))
sns.boxplot(x='Global Event', y=trade_col, data=Trade_Dataset_no_outliers, order=['Financial Crisis', 'Housing Crisis', 'COVID-19 Pandemic', 'Non-Event'])
plt.ylabel(trade_col)
plt.xlabel('Period')
plt.title('Import Trade Volumes by Global Event')
plt.tight_layout()
plt.show()

trade_col = 'Trade Balance'

# Split data
event_trades = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Is Event'] == 1][trade_col].dropna()
non_event_trades = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Is Event'] == 0][trade_col].dropna()

# T-test (Welch's)
t_stat, p_val = ttest_ind(event_trades, non_event_trades, equal_var=False)
print(f"T-test statistic: {t_stat:.3f}")
print(f"P-value: {p_val:.4f}")

# Boxplot by event group
plt.figure(figsize=(9,5))
sns.boxplot(x='Global Event', y=trade_col, data=Trade_Dataset_no_outliers, order=['Financial Crisis', 'Housing Crisis', 'COVID-19 Pandemic', 'Non-Event'])
plt.ylabel(trade_col)
plt.xlabel('Period')
plt.title('Trade Balance Volumes by Global Event')
plt.tight_layout()
plt.show()


trade_col = 'Trade Total'

# Split data
event_trades = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Is Event'] == 1][trade_col].dropna()
non_event_trades = Trade_Dataset_no_outliers[Trade_Dataset_no_outliers['Is Event'] == 0][trade_col].dropna()

# T-test (Welch's)
t_stat, p_val = ttest_ind(event_trades, non_event_trades, equal_var=False)
print(f"T-test statistic: {t_stat:.3f}")
print(f"P-value: {p_val:.4f}")

# Boxplot by event group
plt.figure(figsize=(9,5))
sns.boxplot(x='Global Event', y=trade_col, data=Trade_Dataset_no_outliers, order=['Financial Crisis', 'Housing Crisis', 'COVID-19 Pandemic', 'Non-Event'])
plt.ylabel(trade_col)
plt.xlabel('Period')
plt.title('Trade Total Volumes by Global Event')
plt.tight_layout()
plt.show()

T-test statistic: 3.726
P-value: 0.0005

T-test statistic: 3.981
P-value: 0.0002

T-test statistic: 0.517
P-value: 0.6075

T-test statistic: 4.082
P-value: 0.0001

	Partner Name	Year	Export (US$ Thousand)	Import (US$ Thousand)	AHS Simple Average (%)	AHS Weighted Average (%)	AHS Total Tariff Lines	AHS Dutiable Imports (US$ Thousand)	AHS Duty Free Imports (US$ Thousand)	MFN Simple Average (%)	MFN Weighted Average (%)	MFN Total Tariff Lines	MFN Dutiable Imports (US$ Thousand)	MFN Duty Free Imports (US$ Thousand)	Trade Balance	Trade Total
0	Aruba	1988	3.498100e+03	3.284900e+02	2.80	2.92	155.0	2.346370e+03	781.65	13.59	8.46	1152.0	3.128020e+03	0.00	3.169610e+03	3.826590e+03
1	Afghanistan	1988	2.130304e+05	5.445952e+04	0.88	1.83	548.0	7.020413e+04	23987.37	17.68	12.43	4142.0	9.419150e+04	0.00	1.585709e+05	2.674899e+05
2	Angola	1988	3.755279e+05	3.707028e+05	2.02	3.89	633.0	7.541838e+05	167297.68	12.70	6.14	5438.0	9.214815e+05	0.00	4.825130e+03	7.462307e+05
3	Anguila	1988	3.669800e+02	4.000000e+00	3.71	1.09	33.0	6.500000e+01	518.00	16.63	14.75	322.0	5.830000e+02	0.00	3.629800e+02	3.709800e+02
4	Albania	1988	3.010356e+04	4.770930e+04	1.84	2.38	744.0	6.229453e+04	38901.42	19.20	9.68	5684.0	1.011959e+05	0.00	-1.760574e+04	7.781286e+04
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8091	Latin America & Caribbean	2021	1.330557e+09	1.310305e+09	3.84	6.71	1688963.0	1.339963e+09	6778449.31	8.77	9.32	1509731.0	1.346740e+09	1632.05	2.025145e+07	2.640862e+09
8092	Middle East & North Africa	2021	1.196712e+09	1.088471e+09	4.63	2.14	1445742.0	1.116203e+09	1105646.72	8.56	4.29	1507304.0	1.117309e+09	0.00	1.082410e+08	2.285184e+09
8093	North America	2021	3.823319e+09	2.219849e+09	6.45	4.63	1144964.0	2.053437e+09	2539310.69	8.92	6.30	1533357.0	2.055953e+09	23691.96	1.603469e+09	6.043168e+09
8094	South Asia	2021	6.991380e+08	4.723832e+08	5.09	5.96	969270.0	5.109348e+08	1384185.04	8.50	9.84	1508290.0	5.123190e+08	0.05	2.267548e+08	1.171521e+09
8095	Sub-Saharan Africa	2021	4.951000e+08	4.350468e+08	3.22	1.82	961981.0	4.907281e+08	871108.71	8.60	5.41	1513840.0	4.915992e+08	2.83	6.005324e+07	9.301468e+08

	Partner Name	Year	Import (US$ Thousand)	Export (US$ Thousand)	AHS Simple Average (%)	AHS Weighted Average (%)	AHS Total Tariff Lines	AHS Dutiable Imports (US$ Thousand)	AHS Duty Free Imports (US$ Thousand)	MFN Simple Average (%)	MFN Weighted Average (%)	MFN Total Tariff Lines	MFN Dutiable Imports (US$ Thousand)	MFN Duty Free Imports (US$ Thousand)	Trade Balance	Trade Total
0	Aruba	1988	3.498100e+03	3.284900e+02	2.80	2.92	155.0	2.346370e+03	781.65	13.59	8.46	1152.0	3.128020e+03	0.00	3.169610e+03	3.826590e+03
1	Afghanistan	1988	2.130304e+05	5.445952e+04	0.88	1.83	548.0	7.020413e+04	23987.37	17.68	12.43	4142.0	9.419150e+04	0.00	1.585709e+05	2.674899e+05
2	Angola	1988	3.755279e+05	3.707028e+05	2.02	3.89	633.0	7.541838e+05	167297.68	12.70	6.14	5438.0	9.214815e+05	0.00	4.825130e+03	7.462307e+05
3	Anguila	1988	3.669800e+02	4.000000e+00	3.71	1.09	33.0	6.500000e+01	518.00	16.63	14.75	322.0	5.830000e+02	0.00	3.629800e+02	3.709800e+02
4	Albania	1988	3.010356e+04	4.770930e+04	1.84	2.38	744.0	6.229453e+04	38901.42	19.20	9.68	5684.0	1.011959e+05	0.00	-1.760574e+04	7.781286e+04
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
8091	Latin America & Caribbean	2021	1.330557e+09	1.310305e+09	3.84	6.71	1688963.0	1.339963e+09	6778449.31	8.77	9.32	1509731.0	1.346740e+09	1632.05	2.025145e+07	2.640862e+09
8092	Middle East & North Africa	2021	1.196712e+09	1.088471e+09	4.63	2.14	1445742.0	1.116203e+09	1105646.72	8.56	4.29	1507304.0	1.117309e+09	0.00	1.082410e+08	2.285184e+09
8093	North America	2021	3.823319e+09	2.219849e+09	6.45	4.63	1144964.0	2.053437e+09	2539310.69	8.92	6.30	1533357.0	2.055953e+09	23691.96	1.603469e+09	6.043168e+09
8094	South Asia	2021	6.991380e+08	4.723832e+08	5.09	5.96	969270.0	5.109348e+08	1384185.04	8.50	9.84	1508290.0	5.123190e+08	0.05	2.267548e+08	1.171521e+09
8095	Sub-Saharan Africa	2021	4.951000e+08	4.350468e+08	3.22	1.82	961981.0	4.907281e+08	871108.71	8.60	5.41	1513840.0	4.915992e+08	2.83	6.005324e+07	9.301468e+08

University of Niagara Falls, Canada¶

Master's Data Analytics¶

Spring 2025 Data Analytics Case Study 3 (DAMO-611-6)¶

Group 6¶

Professor Zeeshan Ahmad¶

Case Study Part 1¶

Dataset:¶

Source:¶

Phase 1: Problem Definition, Research Question, Hypothesis Formulation, Data Collection¶

Phase 2 and 3: Data Understanding and Data Visualization:¶

Problem Statement:¶

Libraries Importation¶

Import Dataset and Initial Exploration¶

Cleaning Data¶

Preprocesing Data¶

Data Visualization¶

Trends in the information¶

Correlation in the variables¶

Insighs with external events¶

Case Study Part 2¶

Dataset:¶

Source:¶

Phase 4 and 5: Model Building and Model Evaluation:¶

Libraries Importation¶

Import Dataset¶

Initial Filter with the countries to analyze¶

Creation and preprocessing of the work dataset¶

Checking and handling outliers for each country separately¶

H1: A country’s average tariff rates (AHS and MFN) and its trade balance (exports minus imports) from 1988 to 2021 have a statistically significant negative correlation¶

H2: Countries with a higher proportion of duty-free tariff lines have significantly higher export volumes compared to countries with lower duty-free shares.¶

H3: Major global events (such as financial crises and pandemics) are associated with statistically significant changes in global trade volumes, as observed in year-over-year export and import data¶